Image Clustering

Loading the Libraries

In [178]:
%matplotlib inline
In [239]:
import time
import os, os.path
import random
import cv2
import glob
import keras
import matplotlib
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from sklearn.decomposition import PCA
import sys
import pandas as pd
import numpy as np
import pickle
from PIL import Image
import shutil
In [180]:
def create_classwise_distribution(dataset_filename):
    class_names = os.listdir(dataset_filename)
    data = list()
    for class_name in class_names:
        file_names = os.listdir(os.path.join(dataset_filename,class_name))
        for file in file_names:
            data.append({
                "FileName":os.path.join(dataset_filename,class_name,file),
                "ClassName":class_name,
                "Number_of_Samples":len(file_names)
            })
    data = pd.DataFrame(data)
    if not os.path.exists(os.path.join(os.getcwd(),"csv_files")):
        os.makedirs(os.path.join(os.getcwd(),"csv_files"))
    data.to_csv(os.path.join(os.getcwd(),"csv_files","class_wise_distribution.csv"),index=False)
    

# data_path = os.path.join(os.getcwd(),"data")
data_path = "D:/Content_Classification/Data"
create_classwise_distribution(data_path)
class_names = os.listdir(data_path)

data = pd.read_csv(os.path.join(os.getcwd(),"csv_files","class_wise_distribution.csv"))
data.head()
Out[180]:
FileName ClassName Number_of_Samples
0 D:/Content_Classification/Data\Aadhar\00_0.jpg Aadhar 91
1 D:/Content_Classification/Data\Aadhar\20181003... Aadhar 91
2 D:/Content_Classification/Data\Aadhar\20181003... Aadhar 91
3 D:/Content_Classification/Data\Aadhar\20181003... Aadhar 91
4 D:/Content_Classification/Data\Aadhar\20181003... Aadhar 91
In [183]:
print(data.shape[0])
print("The number of total files ",data.shape[0])
1532
The number of total files  1532
In [184]:
data_classwise = pd.read_csv(os.path.join(os.getcwd(),"csv_files","data_classwise.csv"))
data_classwise = data_classwise[data_classwise['class_name']!="Data"]
data_classwise.to_csv(os.path.join(os.getcwd(),"csv_files","data_classwise.csv"),index = False)
data_classwise
Out[184]:
class_name num_files
0 Aadhar 91.0
1 Airtel Mobile Bill 43.0
2 Bank Statement 9.0
3 Credit Card Bills 18.0
4 DubaiID 86.0
5 Electricity Bill 32.0
6 Floor Plan 27.0
7 Form 1040 100.0
8 Form 2106 100.0
9 Form 2441 100.0
10 Form 6251 100.0
11 IGL 20.0
12 Insurance 36.0
13 Medical Report 12.0
14 NewgenIDs 108.0
15 NewgenVisitingCard 159.0
16 PAN 179.0
17 Passport 192.0
18 Resume 120.0
In [185]:
data.shape
Out[185]:
(1532, 3)

Creating the minimum threshold = 50 images

In [186]:
deficient_class_list = list()
for i in range(data_classwise.shape[0]):
    if data_classwise.iloc[i,1]<50:
        deficient_class_list.append(data_classwise.iloc[i,0])
print(deficient_class_list)
['Airtel Mobile Bill', 'Bank Statement', 'Credit Card Bills', 'Electricity Bill', 'Floor Plan', 'IGL', 'Insurance', 'Medical Report']

REMOVING THE CLASS WITH LESS THAN THRESHOLD IMAGES

In [188]:
for class_name in deficient_class_list:
    data = data[data['ClassName']!=class_name]
    data_classwise = data_classwise[data_classwise['class_name']!=class_name]
    
print("The updated number of data files left",data.shape[0])
data.to_csv(os.path.join(os.getcwd(),"csv_files","filepath_truncated.csv"),index=False)
data = pd.read_csv(os.path.join(os.getcwd(),"csv_files","filepath_truncated.csv"))
print("The number of classes to be considered for first iteration",data_classwise.shape[0])
The updated number of data files left 1335
The number of classes to be considered for first iteration 11
In [189]:
data_classwise
Out[189]:
class_name num_files
0 Aadhar 91.0
4 DubaiID 86.0
7 Form 1040 100.0
8 Form 2106 100.0
9 Form 2441 100.0
10 Form 6251 100.0
14 NewgenIDs 108.0
15 NewgenVisitingCard 159.0
16 PAN 179.0
17 Passport 192.0
18 Resume 120.0

IMAGE PREPROCESSING

Loading the images

Now we create a function that loads all images in a directory for a given array of codes in one array and creates the corresponding label array for them.

Loaded images are resized to 224 x 224 before storing them in our array since this is the size preferred by VGG19 which we will be using later.

In [190]:
# Function returns an array of images whoose filenames start with a given set of characters after resizing them to 224 x 224
from tqdm import tqdm_notebook as tqdm
count = 0
def load_images(data):
    images = []
    labels = []
    filename = []
    for i in tqdm(range(data.shape[0])):
        try:
            class_name = data.loc[i,"ClassName"]
            file_path = data.loc[i,"FileName"]
            image = cv2.imread(file_path)
            image = cv2.resize(image,(224,224))
            image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
            images.append(image)
            labels.append(class_name)
            filename.append(file_path)
        except Exception as e:
            print(str(e))
            print("Raising an exception")
            count+=1
    return images,labels,filename
images,labels,filenames = load_images(data)
print(count)
0

Pickling the image,labels,filenames

As the previous execution was very much time consuming so to avoid running the same iterations again and again,we will save the image,labels,filenames in the pickle files

In [191]:
if not os.path.exists(os.path.join(os.getcwd(),"pickle_files")):
    os.makedirs(os.path.join(os.getcwd(),"pickle_files"))
with open(os.path.join(os.getcwd(),"pickle_files","images_list.pkl"), "wb") as fp:   #Pickling
    pickle.dump(images, fp)
with open(os.path.join(os.getcwd(),"pickle_files","labels_list.pkl"), "wb") as fp:   
    pickle.dump(labels, fp)
with open(os.path.join(os.getcwd(),"pickle_files","filenames_list.pkl"), "wb") as fp:   
    pickle.dump(filenames, fp)
In [192]:
with open(os.path.join(os.getcwd(),"pickle_files","images_list.pkl"), "rb") as fp:   # Unpickling
    images = pickle.load(fp)
with open(os.path.join(os.getcwd(),"pickle_files","labels_list.pkl"), "rb") as fp:
    labels = pickle.load(fp)
with open(os.path.join(os.getcwd(),"pickle_files","filenames_list.pkl"), "rb") as fp:
    filenames = pickle.load(fp)

Show the Images

In [193]:
def show_random_images(images, labels, number_of_images_to_show=2):

    for labels_iter in list(set(labels)):

        indicies = [i for i, label in enumerate(labels) if label == labels_iter]
        random_indicies = [random.choice(indicies) for i in range(number_of_images_to_show)]
        figure, axis = plt.subplots(1, number_of_images_to_show)

        print("{} random images for code {}".format(number_of_images_to_show, labels_iter))

        for image in range(number_of_images_to_show):
            axis[image].imshow(images[random_indicies[image]])
        plt.show()
In [194]:
show_random_images(images, labels)
2 random images for code Resume
2 random images for code Aadhar
2 random images for code DubaiID
2 random images for code NewgenIDs
2 random images for code NewgenVisitingCard
2 random images for code Form 2106
2 random images for code Form 2441
2 random images for code Form 6251
2 random images for code Passport
2 random images for code Form 1040
2 random images for code PAN

Normalise...

We now convert the images and labels to NumPy arrays to make processing them easier. We then normaise the images before passing them on to VGG19

In [195]:
def normalise_images(images, labels):

    # Convert to numpy arrays
    images = np.array(images, dtype=np.float32)
    labels = np.array(labels)

    # Normalise the images
    images /= 255
    
    return images, labels
In [196]:
images, labels = normalise_images(images, labels)
In [197]:
X_train, y_train,filenames = images,labels,filenames

Load pre-trained covnet models

VGG16, VG19, ResNet50

We'll now load up the keras models with the imagenet weights. We'll remove the top dense layers, since we won't need to classify things here, and we just want these encoded features from the images.

In [24]:
# Load the models with ImageNet weights

# vgg16_model = keras.applications.vgg16.VGG16(include_top=False, weights="imagenet", input_shape=(224,224,3))

vgg19_model = keras.applications.vgg19.VGG19(include_top=False, weights="imagenet", input_shape=(224,224,3))

# resnet50_model = keras.applications.resnet50.ResNet50(include_top=False, weights="imagenet", input_shape=(224,224,3))
In [25]:
def covnet_transform(covnet_model, raw_images):

    # Pass our training data through the network
    pred = covnet_model.predict(raw_images)

    # Flatten the array
    flat = pred.reshape(raw_images.shape[0], -1)
    
    return flat
In [26]:
# vgg16_output = covnet_transform(vgg16_model, X_train)
# print("VGG16 flattened output has {} features".format(vgg16_output.shape[1]))

vgg19_output = covnet_transform(vgg19_model, X_train)
print("VGG19 flattened output has {} features".format(vgg19_output.shape[1]))

# resnet50_output = covnet_transform(resnet50_model, X_train)
# print("ResNet50 flattened output has {} features".format(resnet50_output.shape[1]))
VGG19 flattened output has 25088 features

The above cell shows us the number of features each covnet gives to a single image. When we compare these to the original size of the image 224 x 224 x 3 = 150,528 pixels/features, we can see that this is a large reduction in what the clustering algorithms will have to work with.

PCA

We need to do Dimensionality reduction as Kmeans is able to produce the results with these large dimensions but not GMM

In [198]:
# Function that creates a PCA instance, fits it to the data and returns the instance
def create_fit_PCA(data, n_components=None):
    
    p = PCA(n_components=n_components, random_state=728)
    p.fit(data)
    
    return p
    
In [199]:
# Create PCA instances for each covnet output
# vgg16_pca = create_fit_PCA(vgg16_output)
vgg19_pca = create_fit_PCA(vgg19_output)
# resnet50_pca = create_fit_PCA(resnet50_output)

DETERMINING NUMBER OF CLUSTERS

In [200]:
# Function to plot the cumulative explained variance of PCA components
# This will help us decide how many components we should reduce our features to
def pca_cumsum_plot(pca,title_name):
    plt.plot(np.cumsum(pca.explained_variance_ratio_))
    plt.title(title_name)
    plt.xlabel('number of components')
    plt.ylabel('cumulative explained variance')
    plt.show()
In [201]:
# Plot the cumulative explained variance for each covnet
# pca_cumsum_plot(vgg16_pca,title_name = "VGG16")
pca_cumsum_plot(vgg19_pca,title_name = "VGG19")
# pca_cumsum_plot(resnet50_pca,title_name = "RESNET50")

Looking at the gaphs above, we can see that PCA can explain almost all the variance in as many dimensions as there are samples.

It is also interesting to note the difference in shape between the VGG graphs and the ResNet one. This is probably due to the fact that ResNet only had 2048 dimensions to start with, while VGGs had 25,088

TRAINING -KMEANS WITH VGG19

In [202]:
# PCA transformations of covnet outputs
# vgg16_output_pca = vgg16_pca.transform(vgg16_output)
vgg19_output_pca = vgg19_pca.transform(vgg19_output)
# resnet50_output_pca = resnet50_pca.transform(resnet50_output)
In [204]:
def create_train_kmeans(data, number_of_clusters=data_classwise.shape[0]):
    # n_jobs is set to -1 to use all available CPU cores. This makes a big difference on an 8-core CPU
    # especially when the data size gets much bigger. #perfMatters
    
    k = KMeans(n_clusters=number_of_clusters, n_jobs=-1, random_state=728)

    # Let's do some timings to see how long it takes to train.
    start = time.time()

    # Train it up
    k.fit(data)

    # Stop the timing 
    end = time.time()

    # And see how long that took
    print("Training took {} seconds".format(end-start))
    
    return k
In [205]:
# def create_train_gmm(data, number_of_clusters=len(os.listdir(data_path))):
#     g = GaussianMixture(n_components=number_of_clusters, covariance_type="full", random_state=728)
    
#     start=time.time()
#     g.fit(data)
#     end=time.time()
    
#     print("Training took {} seconds".format(end-start))
    
#     return g
    
In [206]:
print("KMeans (PCA): \n")
print("\nVGG19")
K_vgg19_pca = create_train_kmeans(vgg19_output_pca)
KMeans (PCA): 


VGG19
Training took 31.696382761001587 seconds
In [207]:
print("KMeans: \n")

print("\nVGG19:")
K_vgg19 = create_train_kmeans(vgg19_output)
KMeans: 


VGG19:
Training took 23.31485915184021 seconds
In [208]:
k_vgg19_pred_pca = K_vgg19_pca.predict(vgg19_output_pca)
k_vgg19_pred = K_vgg19.predict(vgg19_output)

Remember that the clustering algorith does not detect which images are cats and which are dogs, it only groups images that look alike together and assigns them a number arbitrarily.

We now need to count how many of each label are in each cluster, this way we can take a look and if sufficient eperation has happened we can quicly see which cluster is which label. So let's write a function that does that.

In [209]:
def cluster_label_count(clusters, labels):
    
    count = {}
    
    # Get unique clusters and labels
    unique_clusters = list(set(clusters))
    unique_labels = list(set(labels))
    
    # Create counter for each cluster/label combination and set it to 0
    for cluster in unique_clusters:
        count[cluster] = {}
        
        for label in unique_labels:
            count[cluster][label] = 0
    
    # Let's count
    for i in range(len(clusters)):
        count[clusters[i]][labels[i]] +=1
    
    cluster_df = pd.DataFrame(count)
    
    return cluster_df
In [210]:
# VGG19 KMeans
vgg19_cluster_count = cluster_label_count(k_vgg19_pred, y_train)
vgg19_cluster_count_pca = cluster_label_count(k_vgg19_pred_pca, y_train)

DATA DISTRIBUTION MATRIX

In [212]:
# print("KMeans VGG19: ")
# vgg19_cluster_count.to_csv('cluster"s_result.csv',index = False)
vgg19_cluster_count
Out[212]:
0 1 2 3 4 5 6 7 8 9 10
Resume 0 0 0 0 0 0 120 0 0 0 0
Aadhar 0 90 0 0 0 1 0 0 0 0 0
DubaiID 0 20 0 4 0 3 0 0 59 0 0
NewgenIDs 0 0 0 0 0 0 0 0 0 108 0
NewgenVisitingCard 0 2 0 0 0 156 0 1 0 0 0
Form 2106 0 0 0 0 0 0 0 0 0 0 100
Form 2441 0 0 0 0 100 0 0 0 0 0 0
Form 6251 0 0 100 0 0 0 0 0 0 0 0
Passport 0 41 0 0 0 2 0 149 0 0 0
Form 1040 100 0 0 0 0 0 0 0 0 0 0
PAN 0 1 0 141 0 37 0 0 0 0 0
In [213]:
# np.sum(vgg19_cluster_count.iloc[0,:])
# clusters_result = pd.read_csv('clusters_result.csv')
# clusters_result

DATA DISTRIBUTION MATRIX -WITH PCA

In [214]:
print("KMeans VGG19 (PCA): ")
vgg19_cluster_count_pca
KMeans VGG19 (PCA): 
Out[214]:
0 1 2 3 4 5 6 7 8 9 10
Resume 0 0 0 0 0 0 120 0 0 0 0
Aadhar 0 90 0 0 0 1 0 0 0 0 0
DubaiID 0 20 0 4 0 3 0 0 59 0 0
NewgenIDs 0 0 0 0 0 0 0 0 0 108 0
NewgenVisitingCard 0 2 0 0 0 156 0 1 0 0 0
Form 2106 0 0 0 0 0 0 0 0 0 0 100
Form 2441 0 0 0 0 100 0 0 0 0 0 0
Form 6251 0 0 100 0 0 0 0 0 0 0 0
Passport 0 41 0 0 0 2 0 149 0 0 0
Form 1040 100 0 0 0 0 0 0 0 0 0 0
PAN 0 1 0 141 0 37 0 0 0 0 0

RELATIVE PURITY MATRIX

Relative purity of a cluster is defined as

relative purity of cluster 1 w.r.t class_a = (number of samples of class_a in the cluster 1/total number of samples in cluster 1)*100

which implies that 100% value indicates that the cluster does not contain samples from any other class

We set the benchmark for purity of a cluster a relative percentage to be minimum 90 for first iteration to be identified as a pure cluster

In [215]:
data_relative_purity = vgg19_cluster_count.copy()

relative_purity = list()
for i in range(vgg19_cluster_count.shape[0]):
    for j in range(vgg19_cluster_count.shape[0]):
        data_relative_purity.iloc[i,j] = np.round(data_relative_purity.iloc[i,j]/np.sum(data_relative_purity.iloc[:,j]),2)*100

data_relative_purity.to_csv(os.path.join(os.getcwd(),"csv_files","relative_purity.csv"),index=True)
data_relative_purity
Out[215]:
0 1 2 3 4 5 6 7 8 9 10
Resume 0.0 0.0 0.0 0.0 0.0 0.0 100.0 0.0 0.0 0.0 0.0
Aadhar 0.0 58.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0
DubaiID 0.0 16.0 0.0 3.0 0.0 2.0 0.0 0.0 100.0 0.0 0.0
NewgenIDs 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 100.0 0.0
NewgenVisitingCard 0.0 2.0 0.0 0.0 0.0 79.0 0.0 1.0 0.0 0.0 0.0
Form 2106 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 100.0
Form 2441 0.0 0.0 0.0 0.0 100.0 0.0 0.0 0.0 0.0 0.0 0.0
Form 6251 0.0 0.0 100.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
Passport 0.0 35.0 0.0 0.0 0.0 2.0 0.0 99.0 0.0 0.0 0.0
Form 1040 100.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
PAN 0.0 1.0 0.0 98.0 0.0 31.0 0.0 0.0 0.0 0.0 0.0
In [216]:
data_relative_purity.shape
Out[216]:
(11, 11)
In [218]:
# data_classwise = pd.read_csv(os.path.join(os.getcwd(),"csv_files","data_classwise.csv"))
# data_classwise.index = data_classwise['class_name'].tolist()
In [220]:
# data_classwise.iloc[data_classwise['class_name'].tolist(),1]
In [221]:
# i = 0
# data_relative_purity.index.values[i]

IDENTIFYING PURE CLUSTERS ITERATION-1

In [255]:
data_pure_clusters = list()
for i in range(data_relative_purity.shape[0]):
    for j in range(data_relative_purity.shape[0]):
        if data_relative_purity.iloc[i,j]>90:
            classPercentage = np.round(vgg19_cluster_count.iloc[i,j]/np.sum(vgg19_cluster_count.iloc[i,:]),2)*100
#             print(np.sum(vgg19_cluster_count.iloc[i,:]),2)
#             print(data.index.values[i],end=' ')
#             print("Cluster number-->",i)
#             print(data.iloc[j,i])
    
            data_pure_clusters.append({
                        "ClassName":data_relative_purity.index.values[i],
                        "ClusterNumber":j,
                        "PurityPercentage":data_relative_purity.iloc[i,j],
                        "ClassPercentage":classPercentage
                    })
data_pure_clusters = pd.DataFrame(data_pure_clusters)
data_pure_clusters.to_csv(os.path.join(os.getcwd(),"csv_files","data_pure_clusters.csv"),index=False)
data_pure_classes = pd.read_csv(os.path.join(os.getcwd(),"csv_files","data_pure_clusters.csv"))

data_pure_classes
Out[255]:
ClassName ClusterNumber PurityPercentage ClassPercentage
0 Resume 6 100.0 100.0
1 DubaiID 8 100.0 69.0
2 NewgenIDs 9 100.0 100.0
3 Form 2106 10 100.0 100.0
4 Form 2441 4 100.0 100.0
5 Form 6251 2 100.0 100.0
6 Passport 7 99.0 78.0
7 Form 1040 0 100.0 100.0
8 PAN 3 98.0 79.0

VISUALIZING THE INTERMEDIATE PURE CLUSTERS OF A CLASS

In [256]:
vgg19_cluster_count
Out[256]:
0 1 2 3 4 5 6 7 8 9 10
Resume 0 0 0 0 0 0 120 0 0 0 0
Aadhar 0 90 0 0 0 1 0 0 0 0 0
DubaiID 0 20 0 4 0 3 0 0 59 0 0
NewgenIDs 0 0 0 0 0 0 0 0 0 108 0
NewgenVisitingCard 0 2 0 0 0 156 0 1 0 0 0
Form 2106 0 0 0 0 0 0 0 0 0 0 100
Form 2441 0 0 0 0 100 0 0 0 0 0 0
Form 6251 0 0 100 0 0 0 0 0 0 0 0
Passport 0 41 0 0 0 2 0 149 0 0 0
Form 1040 100 0 0 0 0 0 0 0 0 0 0
PAN 0 1 0 141 0 37 0 0 0 0 0

Visualizing the Clusters of class PAN

In [257]:
pure_clusters_list = list()
class_name = "PAN"
clusters_name_list = list()

for i in range(vgg19_cluster_count.shape[0]):
    if vgg19_cluster_count.index[i]==class_name:
            for j in range(vgg19_cluster_count.shape[1]):
                if vgg19_cluster_count.iloc[i,j]>0:
                    clusters_name_list.append(j)

print(clusters_name_list)
pure_clusters_list = clusters_name_list.copy()
[1, 3, 5]
In [258]:
def ClusterIndicesNumpy(clustNum, labels_array): #numpy 
    return np.where(labels_array == clustNum)[0]

# print(ClusterIndicesNumpy(2, K_vgg19.labels_))
# class_index_list = ClusterIndicesNumpy(5, K_vgg19.labels_)
# print(class_index_list)
# print(len(class_index_list))
In [259]:
print(np.unique(K_vgg19.labels_))
pure_files_list = list()
for pure_clusters in pure_clusters_list:
    pure_files_list.append(ClusterIndicesNumpy(pure_clusters, K_vgg19.labels_))

pure_files_list = np.array(pure_files_list)
print(pure_files_list.shape[0])
print(pure_files_list[0].shape[0])
# print(pure_files_list)
[ 0  1  2  3  4  5  6  7  8  9 10]
3
154
In [260]:
pure_files_list.shape[0]
Out[260]:
3
In [261]:
# pure_files_list[0]
In [262]:
main_cluster = list()
# pure_files_list[0]
class_name = "PAN"
for i in range(pure_files_list.shape[0]):
#     cluster_number = pure_clusters_list[i]
    per_cluster = list()
    for j in pure_files_list[i]:
#         print(data.iloc[j,1])
        if data.iloc[j,1]==class_name:
            per_cluster.append(j)
    main_cluster.append(per_cluster)
print(main_cluster)

main_cluster = np.array(main_cluster)
[[860], [844, 845, 846, 847, 848, 849, 850, 851, 852, 853, 854, 855, 861, 862, 863, 864, 865, 866, 869, 870, 874, 877, 879, 881, 883, 885, 887, 888, 889, 890, 891, 892, 893, 894, 895, 896, 898, 899, 900, 901, 902, 903, 904, 905, 906, 907, 908, 909, 910, 913, 914, 915, 916, 917, 918, 919, 920, 921, 923, 924, 925, 928, 929, 930, 931, 932, 933, 934, 935, 936, 937, 938, 939, 940, 942, 944, 945, 946, 947, 948, 950, 953, 954, 956, 957, 958, 959, 960, 961, 969, 970, 972, 973, 975, 976, 977, 978, 979, 980, 981, 982, 983, 984, 985, 986, 987, 988, 989, 990, 991, 992, 993, 994, 995, 996, 997, 998, 999, 1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007, 1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015, 1016, 1017, 1018, 1019, 1020, 1021, 1022], [856, 857, 858, 859, 867, 868, 871, 872, 873, 875, 876, 878, 880, 882, 884, 886, 897, 911, 912, 922, 926, 927, 941, 943, 949, 951, 952, 955, 962, 963, 964, 965, 966, 967, 968, 971, 974]]
In [263]:
print(main_cluster.shape)
(3,)
In [264]:
print(len(main_cluster[0]))
print(len(main_cluster[1]))
print(len(main_cluster[2]))
1
141
37
In [ ]:
 

Copying the Clusters files

In [304]:
if os.path.exists(os.path.join(os.getcwd(),"clusters")):
    shutil.rmtree(os.path.join(os.getcwd(),"clusters"), ignore_errors=False, onerror=None)
for row in range(main_cluster.shape[0]):
    if not os.path.exists(os.path.join(os.getcwd(),"clusters","pan"+str(row))):
        os.makedirs(os.path.join(os.getcwd(),"clusters","pan"+str(row)))
    for column in range(len(main_cluster[row])):
#         print(column)
#         print(row,column)
        shutil.copy(data.iloc[main_cluster[row][column],0],os.path.join(os.getcwd(),"clusters","pan"+str(row)))
In [305]:
from PIL import Image
Image.open(data.iloc[main_cluster[0][0],0])
Out[305]:
In [306]:
Image.open(data.iloc[main_cluster[1][0],0])
Out[306]:
In [307]:
Image.open(data.iloc[main_cluster[2][0],0])
Out[307]:

Making the folders

We will be making the folders for each clusters formed and their associated classes

In [308]:
cluster_report = vgg19_cluster_count.copy()
cluster_report
Out[308]:
0 1 2 3 4 5 6 7 8 9 10
Resume 0 0 0 0 0 0 120 0 0 0 0
Aadhar 0 90 0 0 0 1 0 0 0 0 0
DubaiID 0 20 0 4 0 3 0 0 59 0 0
NewgenIDs 0 0 0 0 0 0 0 0 0 108 0
NewgenVisitingCard 0 2 0 0 0 156 0 1 0 0 0
Form 2106 0 0 0 0 0 0 0 0 0 0 100
Form 2441 0 0 0 0 100 0 0 0 0 0 0
Form 6251 0 0 100 0 0 0 0 0 0 0 0
Passport 0 41 0 0 0 2 0 149 0 0 0
Form 1040 100 0 0 0 0 0 0 0 0 0 0
PAN 0 1 0 141 0 37 0 0 0 0 0
In [309]:
def ClusterIndicesNumpy(clustNum, labels_array): #numpy 
    return np.where(labels_array == clustNum)[0]

for i in range(cluster_report.shape[0]):
    class_index_list = ClusterIndicesNumpy(i,K_vgg19.labels_)
    for j in class_index_list:
        if not os.path.exists(os.path.join(os.getcwd(),"clusters",str(i),data.iloc[j,1])):
            os.makedirs(os.path.join(os.getcwd(),"clusters",str(i),data.iloc[j,1]))
        shutil.copy(data.iloc[j,0],os.path.join(os.getcwd(),"clusters",str(i),data.iloc[j,1]))

OBSERVATION

The above tabel contains the pure clusters with the class percentage

where class percentage is defined as

class percentage = (number of samples of class in that particular cluster)/(total number of samples of that class)

We observe that we find 13 pure clusters out of 18 initial clusters initially formed

Finding the Correctly identified classes

we create a benchmark for correctly identified classes as classpercentage > 80.0%

First we will club all the pure clusters of the classes

if their sum >80.0 then the class is most suitable for image based clustering

In [310]:
classPercentageDict = dict()
for iterator in range(data_pure_classes.shape[0]):
    try:
        classPercentageDict[data_pure_classes.iloc[iterator,0]]+=data_pure_classes.iloc[iterator,3]
    except:
        classPercentageDict[data_pure_classes.iloc[iterator,0]] = data_pure_classes.iloc[iterator,3]
In [311]:
classPercentageDict
Out[311]:
{'Resume': 100.0,
 'DubaiID': 69.0,
 'NewgenIDs': 100.0,
 'Form 2106': 100.0,
 'Form 2441': 100.0,
 'Form 6251': 100.0,
 'Passport': 78.0,
 'Form 1040': 100.0,
 'PAN': 79.0}
In [312]:
# data_classwise = pd.read_csv(os.path.join(os.getcwd(),"csv_files","data_classwise.csv"))
# data_classwise
In [313]:
# class_name = "Form 2106"
# print(data_classwise['class_name'].tolist().index(class_name))
In [314]:
# type(data_classwise.iloc[data_classwise['class_name'].tolist().index(class_name),1])
In [315]:
pure_class_report = list()
for class_name,class_percentage in classPercentageDict.items():
    try:
        correctlyClassifiedFiles = int((class_percentage/100)*data_classwise.iloc[data_classwise['class_name'].tolist().index(class_name),1])
        totalFiles = data_classwise.iloc[data_classwise['class_name'].tolist().index(class_name),1]
        pure_class_report.append(
        {
            "className":class_name,
            "ClassPercentage":class_percentage,
            "CorrectlyClassifiedFiles":correctlyClassifiedFiles,
            "TotalFiles":totalFiles
        })
    except Exception as e:
        print(str(e))
pure_class_report = pd.DataFrame(pure_class_report)
pure_class_report.to_csv(os.path.join(os.getcwd(),"csv_files","data_correctly_separated_first_iteration.csv"),index = False)
In [316]:
pure_class_report
Out[316]:
className ClassPercentage CorrectlyClassifiedFiles TotalFiles
0 Resume 100.0 120 120.0
1 DubaiID 69.0 59 86.0
2 NewgenIDs 100.0 108 108.0
3 Form 2106 100.0 100 100.0
4 Form 2441 100.0 100 100.0
5 Form 6251 100.0 100 100.0
6 Passport 78.0 149 192.0
7 Form 1040 100.0 100 100.0
8 PAN 79.0 141 179.0
In [317]:
class_lis = list()
for class_name,class_percentage in classPercentageDict.items():
    if class_percentage > 80.0:
        class_lis.append(class_name)
print(class_lis)
['Resume', 'NewgenIDs', 'Form 2106', 'Form 2441', 'Form 6251', 'Form 1040']

Correctly identified class

['Resume', 'NewgenIDs', 'Form 2106', 'Form 2441', 'Form 6251', 'Form 1040']

Six classes out of Eleven have been correctly identified

Finding impure clusters files

we will now consider all the files that belongs to the impure clusters and create the clusters of those files separately with more clusters as they might have got impure due to insufficient number of clusters

In [318]:
impure_clusters_list = list(set(range(data_classwise.shape[0]))-set(data_pure_clusters['ClusterNumber'].tolist()))
print(impure_clusters_list)
[1, 5]

SELECTING THE FILES FORMING THE PART OF IMPURE CLUSTERS

In [319]:
def ClusterIndicesNumpy(clustNum, labels_array): #numpy 
    return np.where(labels_array == clustNum)[0]

def ClusterIndicesComp(clustNum, labels_array): #list comprehension
    return np.array([i for i, x in enumerate(labels_array) if x == clustNum])

# print(ClusterIndicesNumpy(2, K_vgg19.labels_))
# class_index_list = ClusterIndicesNumpy(2, K_vgg19.labels_)

impure_files_list = list()
for impure_clusters in impure_clusters_list:
    impure_files_list.extend(ClusterIndicesNumpy(impure_clusters, K_vgg19.labels_))
# impure_class_files = list()
# for cluster_number in impure_clusters_list:
#         impure_class_files.append(
#                     ""
#         )
# print(impure_files_list)
print("Number of impure files left ",len(impure_files_list))
Number of impure files left  353
In [320]:
print(len(impure_files_list))
353
In [321]:
data_impure = list()
for impure_files in impure_files_list:
    data_impure.append({
        "Filepath":data.iloc[impure_files,0],
        "ClassName":data.iloc[impure_files,1]
    })
data_impure = pd.DataFrame(data_impure)

REMOVING THE FILES OF THE CORRECTLY IDENTIFIED CLASSES

In [322]:
for class_name in class_lis:
    data_impure = data_impure[data_impure['ClassName']!=class_name]
In [323]:
data_impure.to_csv(os.path.join(os.getcwd(),"csv_files","data_impure.csv"),index=False)
print(data_impure.shape)
(353, 2)
In [324]:
data_impure.head()
Out[324]:
Filepath ClassName
0 D:/Content_Classification/Data\Aadhar\00_0.jpg Aadhar
1 D:/Content_Classification/Data\Aadhar\20181003... Aadhar
2 D:/Content_Classification/Data\Aadhar\20181003... Aadhar
3 D:/Content_Classification/Data\Aadhar\20181003... Aadhar
4 D:/Content_Classification/Data\Aadhar\20181003... Aadhar
In [325]:
data_impure.groupby('ClassName').size()
Out[325]:
ClassName
Aadhar                 91
DubaiID                23
NewgenVisitingCard    158
PAN                    38
Passport               43
dtype: int64
In [326]:
# list(set(data_impure['ClassName'].tolist()))
In [327]:
# data_impure = pd.read_csv(os.path.join(os.getcwd(),"csv_files","data_impure.csv"))
# data_impure.shape
In [328]:
# data_impure.head(10)
In [329]:
# print(class_lis)
In [330]:
# data_impure.groupby('ClassName').size()
In [331]:
# print(class_lis)
In [332]:
# data_impure.to_csv(os.path.join(os.getcwd(),"csv_files","data_impure_refined.csv"),index=False)
In [333]:
with open(os.path.join(os.getcwd(),"pickle_files","class_removed_list_one.pkl"), "wb") as fp:   #Pickling
    pickle.dump(class_lis, fp)    
In [334]:
# data_impure.shape
In [292]:
# data_original = pd.read_csv(os.path.join(os.getcwd(),"csv_files","class_wise_distribution.csv"))
In [293]:
# for class_remove in class_lis:
#     data_original = data_original[data_original['ClassName']!=class_remove]
In [294]:
# data_original.to_csv(os.path.join(os.getcwd(),"csv_files","iteration_two.csv"))
In [295]:
# data_original.shape
In [296]:
# separated_class_one = list();clusters_remove_lis = list();class_remove_lis = list()
# for i in range(data_pure_classes.shape[0]):
#     if data_pure_classes.iloc[i,3]>=85:
#         separated_class_one.append(data_pure_classes.iloc[i,0])
#         clusters_remove_lis.append(data_pure_classes.iloc[i,1])
#         class_remove_lis.append(data_pure_classes.iloc[i,0])
# print(separated_class_one)
In [297]:
# data_iter_a.index.values[0]
In [298]:
# print(clusters_remove_lis)
# print(class_remove_lis)
In [299]:
# data_iter_a  = data_relative_purity
In [300]:
# data_iter_a
In [301]:
# data_iter_a = data.drop(class_remove_lis,axis=0)
# data_iter_a = data_iter_a.drop(clusters_remove_lis,axis=1)
# data_iter_a
In [302]:
# data_iter_a.shape[0]
# data_iter_a.shape[1]
In [303]:
# row_list = list()
# for i in range(data_iter_a.shape[0]):
#     flag = 0
#     for j in range(data_iter_a.shape[1]):
#         if data_iter_a.iloc[i,j]!=0:
#             flag=1
#             print(data_iter_a.index.values[i])
#             break;
#     if not flag:
# #         data_iter_a = data_iter_a.drop(data_iter_a.index.values[i],axis=0)
#         row_list.append(data_iter_a.index.values[i])
# data_iter_a = data_iter_a.drop(row_list,axis=0)
# data_iter_a.to_csv("iteration_one.csv",index=False)
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]: